TASK 1: EXPLORATORY DATA ANALYSIS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
In [2]:
# Load the coursework dataset (CSV in the notebook's working directory);
# one row per (Region, SOC 2-digit occupation), one column per month 17-Jan..24-Aug.
data = pd.read_csv('COMP 1886 - Coursework dataset.csv')
In [3]:
# Summary statistics for the columns pandas parsed as numeric (88 of the 92 month columns).
data.describe()
Out[3]:
17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun 17-Jul 17-Aug 17-Sep 17-Oct ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
count 243.000000 242.000000 243.000000 242.000000 241.000000 243.000000 243.000000 242.000000 243.000000 243.000000 ... 242.000000 243.000000 243.000000 243.000000 243.000000 242.000000 243.000000 243.000000 243.000000 243.000000
mean 3602.798354 2998.161157 3721.646091 3322.500000 3602.593361 3820.534979 3014.855967 3084.256198 2960.370370 3009.032922 ... 2386.694215 2161.790123 2890.061728 2423.477366 2375.493827 2840.516529 2999.794239 2674.444444 2779.670782 2525.329218
std 5116.986073 4050.109781 4913.648323 4330.203757 4632.923305 5029.753757 3894.348692 4014.044595 3834.929216 3996.884943 ... 2611.252824 2461.885697 3118.164010 2569.860261 2421.253680 2802.025690 2888.035657 2543.547703 2618.224805 2370.837347
min 5.000000 5.000000 10.000000 10.000000 10.000000 10.000000 5.000000 5.000000 5.000000 5.000000 ... 10.000000 5.000000 10.000000 10.000000 5.000000 10.000000 10.000000 10.000000 10.000000 5.000000
25% 830.000000 791.250000 1072.500000 906.250000 1015.000000 990.000000 830.000000 845.000000 815.000000 800.000000 ... 635.000000 587.500000 817.500000 702.500000 692.500000 882.500000 952.500000 880.000000 912.500000 847.500000
50% 2215.000000 1845.000000 2385.000000 2122.500000 2320.000000 2370.000000 1925.000000 1935.000000 1835.000000 1900.000000 ... 1622.500000 1290.000000 1870.000000 1635.000000 1645.000000 2095.000000 2325.000000 2090.000000 2200.000000 1930.000000
75% 4222.500000 3792.500000 4645.000000 4123.750000 4540.000000 4657.500000 3822.500000 3943.750000 3750.000000 3777.500000 ... 3345.000000 3107.500000 4107.500000 3520.000000 3490.000000 4085.000000 4337.500000 3802.500000 3995.000000 3610.000000
max 42550.000000 32765.000000 40125.000000 34730.000000 36560.000000 41000.000000 31105.000000 31625.000000 30080.000000 30105.000000 ... 17255.000000 16655.000000 20460.000000 17250.000000 15470.000000 17130.000000 17690.000000 15570.000000 15445.000000 14215.000000

8 rows × 88 columns

In [4]:
# First five rows: 4 descriptive columns followed by the monthly counts.
data.head()
Out[4]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560

5 rows × 96 columns

In [5]:
# Last five rows — note the final 'Unknown' SOC row per region.
data.tail()
Out[5]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
238 Yorkshire and The Humber 81 Process, plant and machine operatives low 1635 1600.0 2190 1985.0 2125.0 2175 ... 1385.0 1280 1885 1570 1620 2160.0 2180 2120 2295 2255
239 Yorkshire and The Humber 82 Transport and mobile machine drivers and opera... high 1435 1280.0 1735 1685.0 1870.0 2290 ... 1190.0 845 1265 1200 1145 1850.0 2345 2385 2575 2480
240 Yorkshire and The Humber 91 Elementary trades and related occupations low 490 590.0 845 660.0 855.0 845 ... 375.0 340 555 425 465 740.0 855 775 910 875
241 Yorkshire and The Humber 92 Elementary administration and service occupations high 2610 2960.0 3825 3680.0 3945.0 5215 ... 2840.0 2550 3955 3590 4005 4935.0 4985 4600 4910 4875
242 Yorkshire and The Humber Unknown Unknown high 85 65.0 85 45.0 70.0 55 ... 35.0 35 45 35 30 15.0 35 25 40 25

5 rows × 96 columns

In [6]:
# 243 rows x 96 columns (4 descriptive + 92 monthly).
data.shape
Out[6]:
(243, 96)
In [7]:
# Keep only the columns pandas already parsed as numeric (int64/float64).
numeric_features = data.select_dtypes(include=[np.number])

numeric_features.columns
# Only 88 of the 92 month columns appear: '17-Dec', '20-Apr', '20-May' and '20-Jun'
# were read as object dtype because they mix strings with the numbers, so they are excluded here.
Out[7]:
Index(['17-Jan', '17-Feb', '17-Mar', '17-Apr', '17-May', '17-Jun', '17-Jul',
       '17-Aug', '17-Sep', '17-Oct', '17-Nov', '18-Jan', '18-Feb', '18-Mar',
       '18-Apr', '18-May', '18-Jun', '18-Jul', '18-Aug', '18-Sep', '18-Oct',
       '18-Nov', '18-Dec', '19-Jan', '19-Feb', '19-Mar', '19-Apr', '19-May',
       '19-Jun', '19-Jul', '19-Aug', '19-Sep', '19-Oct', '19-Nov', '19-Dec',
       '20-Jan', '20-Feb', '20-Mar', '20-Jul', '20-Aug', '20-Sep', '20-Oct',
       '20-Nov', '20-Dec', '21-Jan', '21-Feb', '21-Mar', '21-Apr', '21-May',
       '21-Jun', '21-Jul', '21-Aug', '21-Sep', '21-Oct', '21-Nov', '21-Dec',
       '22-Jan', '22-Feb', '22-Mar', '22-Apr', '22-May', '22-Jun', '22-Jul',
       '22-Aug', '22-Sep', '22-Oct', '22-Nov', '22-Dec', '23-Jan', '23-Feb',
       '23-Mar', '23-Apr', '23-May', '23-Jun', '23-Jul', '23-Aug', '23-Sep',
       '23-Oct', '23-Nov', '23-Dec', '24-Jan', '24-Feb', '24-Mar', '24-Apr',
       '24-May', '24-Jun', '24-Jul', '24-Aug'],
      dtype='object')
In [8]:
# Confirm select_dtypes returned a DataFrame (not a Series).
type(numeric_features)
Out[8]:
pandas.core.frame.DataFrame
In [9]:
# Columns stored as object dtype: the 4 genuinely categorical columns plus
# the 4 month columns whose numeric values are polluted with strings.
categorical_features = data.select_dtypes(include=[object])

categorical_features.columns
# '17-Dec', '20-Apr', '20-May', '20-Jun' are numeric in meaning but contain string
# markers, which is why pandas stored them as object — they need cleaning later.
Out[9]:
Index(['Region', 'SOC 2 digit code', 'SOC 2 digit label',
       'Prioritise using SOC 2 as high or low', '17-Dec', '20-Apr', '20-May',
       '20-Jun'],
      dtype='object')
In [10]:
# Nullity matrix: one column per feature, white gaps mark missing cells; saved for the report.
msno.matrix(data,labels=True)
plt.savefig('missing_data_matrix.png', format='png')
No description has been provided for this image
In [11]:
# Nullity correlation heatmap: shows whether columns tend to be missing together.
msno.heatmap(data)
Out[11]:
<Axes: >
No description has been provided for this image
In [12]:
# Bar chart of non-null counts per column. The original sampled 243 rows from a
# 243-row frame — a no-op shuffle (and non-reproducible without a seed); since
# the bar chart only counts non-nulls per column, row order is irrelevant.
msno.bar(data)
Out[12]:
<Axes: >
No description has been provided for this image
In [13]:
# Pairwise correlations between the month columns. Ending the cell with a bare
# expression uses the notebook's rich HTML table instead of print()'s truncated
# plain-text dump.
numeric_features.corr()
          17-Jan    17-Feb    17-Mar    17-Apr    17-May    17-Jun    17-Jul  \
17-Jan  1.000000  0.996856  0.994637  0.992206  0.990051  0.984449  0.982328   
17-Feb  0.996856  1.000000  0.997807  0.996578  0.995337  0.992016  0.988955   
17-Mar  0.994637  0.997807  1.000000  0.997723  0.995802  0.993270  0.991763   
17-Apr  0.992206  0.996578  0.997723  1.000000  0.998192  0.993655  0.994135   
17-May  0.990051  0.995337  0.995802  0.998192  1.000000  0.992588  0.991493   
...          ...       ...       ...       ...       ...       ...       ...   
24-Apr  0.871993  0.891399  0.899478  0.903340  0.910326  0.916796  0.910697   
24-May  0.857546  0.878374  0.887273  0.891128  0.899358  0.907590  0.899278   
24-Jun  0.851153  0.872888  0.881709  0.885590  0.893656  0.903289  0.895137   
24-Jul  0.847125  0.869292  0.880331  0.885086  0.892163  0.901687  0.898412   
24-Aug  0.841447  0.863619  0.876167  0.883161  0.888186  0.899494  0.900648   

          17-Aug    17-Sep    17-Oct  ...    23-Nov    23-Dec    24-Jan  \
17-Jan  0.985240  0.987182  0.987912  ...  0.927033  0.939611  0.925855   
17-Feb  0.990122  0.992942  0.992762  ...  0.939857  0.951115  0.938736   
17-Mar  0.993204  0.995146  0.992810  ...  0.944329  0.952488  0.943420   
17-Apr  0.995184  0.996521  0.996355  ...  0.946051  0.955856  0.944597   
17-May  0.992742  0.993874  0.993911  ...  0.952116  0.961819  0.950324   
...          ...       ...       ...  ...       ...       ...       ...   
24-Apr  0.902592  0.909408  0.894547  ...  0.980742  0.967023  0.985081   
24-May  0.889433  0.897713  0.880892  ...  0.972006  0.955836  0.975700   
24-Jun  0.884872  0.892956  0.875142  ...  0.968932  0.952137  0.972467   
24-Jul  0.888898  0.894256  0.876668  ...  0.967788  0.947585  0.971418   
24-Aug  0.891015  0.894370  0.876512  ...  0.956763  0.933797  0.959790   

          24-Feb    24-Mar    24-Apr    24-May    24-Jun    24-Jul    24-Aug  
17-Jan  0.915499  0.898250  0.871993  0.857546  0.851153  0.847125  0.841447  
17-Feb  0.930257  0.915364  0.891399  0.878374  0.872888  0.869292  0.863619  
17-Mar  0.936030  0.921011  0.899478  0.887273  0.881709  0.880331  0.876167  
17-Apr  0.937474  0.924246  0.903340  0.891128  0.885590  0.885086  0.883161  
17-May  0.943483  0.931150  0.910326  0.899358  0.893656  0.892163  0.888186  
...          ...       ...       ...       ...       ...       ...       ...  
24-Apr  0.991742  0.995509  1.000000  0.996796  0.995417  0.993698  0.984955  
24-May  0.985184  0.991221  0.996796  1.000000  0.997988  0.994375  0.985115  
24-Jun  0.982019  0.988763  0.995417  0.997988  1.000000  0.995778  0.986195  
24-Jul  0.978710  0.984068  0.993698  0.994375  0.995778  1.000000  0.995106  
24-Aug  0.966811  0.971783  0.984955  0.985115  0.986195  0.995106  1.000000  

[88 rows x 88 columns]
In [14]:
# Column-wise skewness, computed ONCE (the original recomputed it for display),
# persisted for the report, then shown as the cell output.
skewness = data.skew(axis=0, numeric_only=True)
skewness.to_csv('skewness.csv', index=True)
skewness
# Every month is strongly positively skewed: most values cluster at the low end
# of the distribution, with a few very large counts stretching the right tail.
Out[14]:
17-Jan    4.078157
17-Feb    3.942626
17-Mar    3.846692
17-Apr    3.850048
17-May    3.700706
            ...   
24-Apr    1.854467
24-May    1.802054
24-Jun    1.771540
24-Jul    1.709164
24-Aug    1.736313
Length: 88, dtype: float64
In [15]:
# Column-wise excess kurtosis, computed ONCE (the original recomputed it for
# display), persisted for the report, then shown as the cell output.
kurtosis = data.kurt(axis=0, numeric_only=True)
kurtosis.to_csv('kurtosis.csv', index=True)
kurtosis
# Large positive (leptokurtic) values mean heavy tails and sharp peaks —
# consistent with the strong positive skew above, where a few very large counts
# pull the tail out to the right.
Out[15]:
17-Jan    22.840545
17-Feb    21.553227
17-Mar    20.762511
17-Apr    20.901203
17-May    19.483174
            ...    
24-Apr     4.627204
24-May     4.306698
24-Jun     4.204314
24-Jul     3.803149
24-Aug     4.073068
Length: 88, dtype: float64
In [16]:
# NOTE(review): this cell is an exact duplicate of In [13] — consider deleting it.
# Kept for now, but shown via rich display rather than print() for readability.
numeric_features.corr()
          17-Jan    17-Feb    17-Mar    17-Apr    17-May    17-Jun    17-Jul  \
17-Jan  1.000000  0.996856  0.994637  0.992206  0.990051  0.984449  0.982328   
17-Feb  0.996856  1.000000  0.997807  0.996578  0.995337  0.992016  0.988955   
17-Mar  0.994637  0.997807  1.000000  0.997723  0.995802  0.993270  0.991763   
17-Apr  0.992206  0.996578  0.997723  1.000000  0.998192  0.993655  0.994135   
17-May  0.990051  0.995337  0.995802  0.998192  1.000000  0.992588  0.991493   
...          ...       ...       ...       ...       ...       ...       ...   
24-Apr  0.871993  0.891399  0.899478  0.903340  0.910326  0.916796  0.910697   
24-May  0.857546  0.878374  0.887273  0.891128  0.899358  0.907590  0.899278   
24-Jun  0.851153  0.872888  0.881709  0.885590  0.893656  0.903289  0.895137   
24-Jul  0.847125  0.869292  0.880331  0.885086  0.892163  0.901687  0.898412   
24-Aug  0.841447  0.863619  0.876167  0.883161  0.888186  0.899494  0.900648   

          17-Aug    17-Sep    17-Oct  ...    23-Nov    23-Dec    24-Jan  \
17-Jan  0.985240  0.987182  0.987912  ...  0.927033  0.939611  0.925855   
17-Feb  0.990122  0.992942  0.992762  ...  0.939857  0.951115  0.938736   
17-Mar  0.993204  0.995146  0.992810  ...  0.944329  0.952488  0.943420   
17-Apr  0.995184  0.996521  0.996355  ...  0.946051  0.955856  0.944597   
17-May  0.992742  0.993874  0.993911  ...  0.952116  0.961819  0.950324   
...          ...       ...       ...  ...       ...       ...       ...   
24-Apr  0.902592  0.909408  0.894547  ...  0.980742  0.967023  0.985081   
24-May  0.889433  0.897713  0.880892  ...  0.972006  0.955836  0.975700   
24-Jun  0.884872  0.892956  0.875142  ...  0.968932  0.952137  0.972467   
24-Jul  0.888898  0.894256  0.876668  ...  0.967788  0.947585  0.971418   
24-Aug  0.891015  0.894370  0.876512  ...  0.956763  0.933797  0.959790   

          24-Feb    24-Mar    24-Apr    24-May    24-Jun    24-Jul    24-Aug  
17-Jan  0.915499  0.898250  0.871993  0.857546  0.851153  0.847125  0.841447  
17-Feb  0.930257  0.915364  0.891399  0.878374  0.872888  0.869292  0.863619  
17-Mar  0.936030  0.921011  0.899478  0.887273  0.881709  0.880331  0.876167  
17-Apr  0.937474  0.924246  0.903340  0.891128  0.885590  0.885086  0.883161  
17-May  0.943483  0.931150  0.910326  0.899358  0.893656  0.892163  0.888186  
...          ...       ...       ...       ...       ...       ...       ...  
24-Apr  0.991742  0.995509  1.000000  0.996796  0.995417  0.993698  0.984955  
24-May  0.985184  0.991221  0.996796  1.000000  0.997988  0.994375  0.985115  
24-Jun  0.982019  0.988763  0.995417  0.997988  1.000000  0.995778  0.986195  
24-Jul  0.978710  0.984068  0.993698  0.994375  0.995778  1.000000  0.995106  
24-Aug  0.966811  0.971783  0.984955  0.985115  0.986195  0.995106  1.000000  

[88 rows x 88 columns]
In [17]:
# Cache the 88x88 correlation matrix; the name `correlation` is reused by the
# two heatmap cells below, so it must not be renamed.
correlation = numeric_features.corr()
correlation.info()
<class 'pandas.core.frame.DataFrame'>
Index: 88 entries, 17-Jan to 24-Aug
Data columns (total 88 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   17-Jan  88 non-null     float64
 1   17-Feb  88 non-null     float64
 2   17-Mar  88 non-null     float64
 3   17-Apr  88 non-null     float64
 4   17-May  88 non-null     float64
 5   17-Jun  88 non-null     float64
 6   17-Jul  88 non-null     float64
 7   17-Aug  88 non-null     float64
 8   17-Sep  88 non-null     float64
 9   17-Oct  88 non-null     float64
 10  17-Nov  88 non-null     float64
 11  18-Jan  88 non-null     float64
 12  18-Feb  88 non-null     float64
 13  18-Mar  88 non-null     float64
 14  18-Apr  88 non-null     float64
 15  18-May  88 non-null     float64
 16  18-Jun  88 non-null     float64
 17  18-Jul  88 non-null     float64
 18  18-Aug  88 non-null     float64
 19  18-Sep  88 non-null     float64
 20  18-Oct  88 non-null     float64
 21  18-Nov  88 non-null     float64
 22  18-Dec  88 non-null     float64
 23  19-Jan  88 non-null     float64
 24  19-Feb  88 non-null     float64
 25  19-Mar  88 non-null     float64
 26  19-Apr  88 non-null     float64
 27  19-May  88 non-null     float64
 28  19-Jun  88 non-null     float64
 29  19-Jul  88 non-null     float64
 30  19-Aug  88 non-null     float64
 31  19-Sep  88 non-null     float64
 32  19-Oct  88 non-null     float64
 33  19-Nov  88 non-null     float64
 34  19-Dec  88 non-null     float64
 35  20-Jan  88 non-null     float64
 36  20-Feb  88 non-null     float64
 37  20-Mar  88 non-null     float64
 38  20-Jul  88 non-null     float64
 39  20-Aug  88 non-null     float64
 40  20-Sep  88 non-null     float64
 41  20-Oct  88 non-null     float64
 42  20-Nov  88 non-null     float64
 43  20-Dec  88 non-null     float64
 44  21-Jan  88 non-null     float64
 45  21-Feb  88 non-null     float64
 46  21-Mar  88 non-null     float64
 47  21-Apr  88 non-null     float64
 48  21-May  88 non-null     float64
 49  21-Jun  88 non-null     float64
 50  21-Jul  88 non-null     float64
 51  21-Aug  88 non-null     float64
 52  21-Sep  88 non-null     float64
 53  21-Oct  88 non-null     float64
 54  21-Nov  88 non-null     float64
 55  21-Dec  88 non-null     float64
 56  22-Jan  88 non-null     float64
 57  22-Feb  88 non-null     float64
 58  22-Mar  88 non-null     float64
 59  22-Apr  88 non-null     float64
 60  22-May  88 non-null     float64
 61  22-Jun  88 non-null     float64
 62  22-Jul  88 non-null     float64
 63  22-Aug  88 non-null     float64
 64  22-Sep  88 non-null     float64
 65  22-Oct  88 non-null     float64
 66  22-Nov  88 non-null     float64
 67  22-Dec  88 non-null     float64
 68  23-Jan  88 non-null     float64
 69  23-Feb  88 non-null     float64
 70  23-Mar  88 non-null     float64
 71  23-Apr  88 non-null     float64
 72  23-May  88 non-null     float64
 73  23-Jun  88 non-null     float64
 74  23-Jul  88 non-null     float64
 75  23-Aug  88 non-null     float64
 76  23-Sep  88 non-null     float64
 77  23-Oct  88 non-null     float64
 78  23-Nov  88 non-null     float64
 79  23-Dec  88 non-null     float64
 80  24-Jan  88 non-null     float64
 81  24-Feb  88 non-null     float64
 82  24-Mar  88 non-null     float64
 83  24-Apr  88 non-null     float64
 84  24-May  88 non-null     float64
 85  24-Jun  88 non-null     float64
 86  24-Jul  88 non-null     float64
 87  24-Aug  88 non-null     float64
dtypes: float64(88)
memory usage: 63.2+ KB
In [18]:
# Unannotated overview heatmap of the correlation matrix.
f, ax = plt.subplots(figsize=(14, 12))
# Draw on the explicit axes rather than the implicit "current" one.
sns.heatmap(correlation, ax=ax)
# NOTE(review): with all correlations above ~0.84 this plot carries little
# information — consider removing it in favour of the annotated version below.
Out[18]:
<Axes: >
No description has been provided for this image
In [19]:
# Annotated correlation heatmap, oversized so each of the 88x88 cells stays legible.
f, ax = plt.subplots(figsize=(40, 30))
ax.set_title('Correlation of Numeric Features with each other', y=1, size=20)
sns.heatmap(correlation, annot=True, ax=ax)
Out[19]:
<Axes: title={'center': 'Correlation of Numeric Features with each other'}>
No description has been provided for this image
In [20]:
# (pandas and matplotlib are already imported in the first cell; the original
# re-imported them here, which is redundant and has been removed.)

# Coerce the month columns to numeric; string markers such as '[x]' become NaN.
numeric_data = data.loc[:, '17-Jan':'24-Aug'].apply(pd.to_numeric, errors='coerce')

# Total advertisements per month, summed over every (region, SOC) row.
overall_trend = numeric_data.sum(axis=0)

# Plot the overall monthly trend.
plt.figure(figsize=(20, 12))  # 20 x 12 inches so all 88 month labels stay readable
plt.plot(overall_trend.index, overall_trend.values, marker='o', color='green')
plt.title('Overall Monthly Job Advertisement Trends Across All Regions', fontsize=16)
plt.xlabel('Month-Year', fontsize=12)
plt.ylabel('Total Advertisements', fontsize=12)
plt.xticks(rotation=60, fontsize=8)  # rotate x-axis labels for readability
plt.grid(True)

# Persist the figure for the report, then render it.
plt.savefig('overall_trend_large.png', format='png')
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [21]:
# (pandas and matplotlib are already imported in the first cell; the redundant
# in-cell imports have been removed.)

# Step 1: coerce the month columns to numeric (non-numeric entries -> NaN).
numeric_data = data.loc[:, '17-Jan':'24-Aug'].apply(pd.to_numeric, errors='coerce')

# Step 2: map each year to its month columns ('17-Jan' ... start with the 2-digit year).
years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
yearly_columns = {
    year: [col for col in numeric_data.columns if col.startswith(year[-2:])]
    for year in years
}

# Step 3: total advertisements per row for each year.
yearly_totals = pd.DataFrame({
    year: numeric_data[columns].sum(axis=1) for year, columns in yearly_columns.items()
})

# Step 4: attach the SOC label so we can aggregate by occupation.
yearly_totals['SOC 2 digit label'] = data['SOC 2 digit label']

# Step 5: one row per SOC label, one column per year; the label becomes the
# index, so the original's .drop(columns=['SOC 2 digit label'], errors='ignore')
# was a no-op and has been removed.
soc_yearly_totals = yearly_totals.groupby('SOC 2 digit label').sum()

# Step 6: plot. BUG FIX: the original called plt.figure(figsize=(20, 12)) first,
# but DataFrame.plot opens its own figure, leaving a stray empty figure behind
# (the "<Figure size 2000x1200 with 0 Axes>" output); passing figsize to .plot
# is sufficient.
ax = soc_yearly_totals.plot(kind='bar', colormap='viridis', figsize=(20, 12), width=0.8)

# Step 7: customise appearance.
plt.title('Yearly Advertisements by SOC Label', fontsize=18)
plt.xlabel('SOC 2 Digit Label', fontsize=14)
plt.ylabel('Total Advertisements', fontsize=14)
plt.xticks(rotation=90, fontsize=12)
plt.legend(title='Year', fontsize=12, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('Yearly Advertisements by SOC Label.png', format='png')

# Step 8: show the plot.
plt.show()
<Figure size 2000x1200 with 0 Axes>
No description has been provided for this image
In [22]:
# (matplotlib is already imported in the first cell. The original also imported
# ScalarFormatter and installed it, only to overwrite it on the very next line
# with the FuncFormatter below — both the import and that call were dead code
# and have been removed.)

# Coerce the month columns to numeric (non-numeric entries -> NaN).
numeric_data = data.loc[:, '17-Jan':'24-Aug'].apply(pd.to_numeric, errors='coerce')

# Group the numeric columns by region (taken from the original frame) and total each month.
region_totals = numeric_data.groupby(data['Region']).sum()

# Collapse the months into one grand total per region.
region_totals_sum = region_totals.sum(axis=1)

# Bar chart of total advertisements per region.
region_totals_sum.plot(kind='bar', figsize=(10, 6), color='royalblue')

# Show plain comma-separated numbers on the y-axis instead of scientific notation.
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{int(x):,}'))

# Chart labels and formatting.
plt.title('Total Advertisements by Region')
plt.xlabel('Region')
plt.ylabel('Total Advertisements')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('Total contr.png', format='png')
plt.show()
No description has been provided for this image
In [23]:
# Coerce every month column (column 5 onward) to numeric IN PLACE; string
# markers such as '[x]' become NaN. Note this mutates `data` for later cells.
month_cols = data.columns[4:]
data.loc[:, month_cols] = data.loc[:, month_cols].apply(pd.to_numeric, errors='coerce')

# Reshape to long format: one row per (SOC label, month, value) for seaborn.
melted_data = data.melt(id_vars=['SOC 2 digit label'], value_vars=month_cols)

# One box per SOC label, showing the spread of its monthly counts.
plt.figure(figsize=(12, 6))
sns.boxplot(x='SOC 2 digit label', y='value', data=melted_data)
plt.title('Distribution of Values Across SOC Labels')
plt.xlabel('SOC 2 Digit Label')
plt.ylabel('Values')
plt.xticks(rotation=90, fontsize=8)
plt.savefig('Distribution by SOC Label.png', format='png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 

TASK 2: DATA PREPROCESSING

In [24]:
data.head()
Out[24]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560

5 rows × 96 columns

In [25]:
data.isnull().sum()
Out[25]:
Region                                    0
SOC 2 digit code                          0
SOC 2 digit label                         0
Prioritise using SOC 2 as high or low    29
17-Jan                                    0
                                         ..
24-Apr                                    1
24-May                                    0
24-Jun                                    0
24-Jul                                    0
24-Aug                                    0
Length: 96, dtype: int64
In [26]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 96 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Region                                 243 non-null    object 
 1   SOC 2 digit code                       243 non-null    object 
 2   SOC 2 digit label                      243 non-null    object 
 3   Prioritise using SOC 2 as high or low  214 non-null    object 
 4   17-Jan                                 243 non-null    int64  
 5   17-Feb                                 242 non-null    float64
 6   17-Mar                                 243 non-null    int64  
 7   17-Apr                                 242 non-null    float64
 8   17-May                                 241 non-null    float64
 9   17-Jun                                 243 non-null    int64  
 10  17-Jul                                 243 non-null    int64  
 11  17-Aug                                 242 non-null    float64
 12  17-Sep                                 243 non-null    int64  
 13  17-Oct                                 243 non-null    int64  
 14  17-Nov                                 242 non-null    float64
 15  17-Dec                                 242 non-null    object 
 16  18-Jan                                 243 non-null    int64  
 17  18-Feb                                 243 non-null    int64  
 18  18-Mar                                 243 non-null    int64  
 19  18-Apr                                 240 non-null    float64
 20  18-May                                 243 non-null    int64  
 21  18-Jun                                 243 non-null    int64  
 22  18-Jul                                 243 non-null    int64  
 23  18-Aug                                 243 non-null    int64  
 24  18-Sep                                 242 non-null    float64
 25  18-Oct                                 243 non-null    int64  
 26  18-Nov                                 243 non-null    int64  
 27  18-Dec                                 243 non-null    int64  
 28  19-Jan                                 243 non-null    int64  
 29  19-Feb                                 243 non-null    int64  
 30  19-Mar                                 242 non-null    float64
 31  19-Apr                                 243 non-null    int64  
 32  19-May                                 243 non-null    int64  
 33  19-Jun                                 242 non-null    float64
 34  19-Jul                                 243 non-null    int64  
 35  19-Aug                                 243 non-null    int64  
 36  19-Sep                                 243 non-null    int64  
 37  19-Oct                                 242 non-null    float64
 38  19-Nov                                 243 non-null    int64  
 39  19-Dec                                 243 non-null    int64  
 40  20-Jan                                 243 non-null    int64  
 41  20-Feb                                 242 non-null    float64
 42  20-Mar                                 243 non-null    int64  
 43  20-Apr                                 242 non-null    object 
 44  20-May                                 240 non-null    object 
 45  20-Jun                                 242 non-null    object 
 46  20-Jul                                 243 non-null    int64  
 47  20-Aug                                 243 non-null    int64  
 48  20-Sep                                 243 non-null    int64  
 49  20-Oct                                 241 non-null    float64
 50  20-Nov                                 243 non-null    int64  
 51  20-Dec                                 243 non-null    int64  
 52  21-Jan                                 242 non-null    float64
 53  21-Feb                                 243 non-null    int64  
 54  21-Mar                                 243 non-null    int64  
 55  21-Apr                                 242 non-null    float64
 56  21-May                                 243 non-null    int64  
 57  21-Jun                                 243 non-null    int64  
 58  21-Jul                                 243 non-null    int64  
 59  21-Aug                                 242 non-null    float64
 60  21-Sep                                 243 non-null    int64  
 61  21-Oct                                 243 non-null    int64  
 62  21-Nov                                 242 non-null    float64
 63  21-Dec                                 243 non-null    int64  
 64  22-Jan                                 243 non-null    int64  
 65  22-Feb                                 242 non-null    float64
 66  22-Mar                                 243 non-null    int64  
 67  22-Apr                                 243 non-null    int64  
 68  22-May                                 242 non-null    float64
 69  22-Jun                                 243 non-null    int64  
 70  22-Jul                                 243 non-null    int64  
 71  22-Aug                                 242 non-null    float64
 72  22-Sep                                 243 non-null    int64  
 73  22-Oct                                 243 non-null    int64  
 74  22-Nov                                 242 non-null    float64
 75  22-Dec                                 243 non-null    int64  
 76  23-Jan                                 241 non-null    float64
 77  23-Feb                                 243 non-null    int64  
 78  23-Mar                                 242 non-null    float64
 79  23-Apr                                 243 non-null    int64  
 80  23-May                                 242 non-null    float64
 81  23-Jun                                 243 non-null    int64  
 82  23-Jul                                 243 non-null    int64  
 83  23-Aug                                 242 non-null    float64
 84  23-Sep                                 243 non-null    int64  
 85  23-Oct                                 242 non-null    float64
 86  23-Nov                                 242 non-null    float64
 87  23-Dec                                 243 non-null    int64  
 88  24-Jan                                 243 non-null    int64  
 89  24-Feb                                 243 non-null    int64  
 90  24-Mar                                 243 non-null    int64  
 91  24-Apr                                 242 non-null    float64
 92  24-May                                 243 non-null    int64  
 93  24-Jun                                 243 non-null    int64  
 94  24-Jul                                 243 non-null    int64  
 95  24-Aug                                 243 non-null    int64  
dtypes: float64(27), int64(61), object(8)
memory usage: 182.4+ KB
In [27]:
# Work on a copy so the original DataFrame is left untouched.
new_data = data.copy()

# The four month columns that were read as object dtype because they contain '[x]' markers.
columns_to_check = ['17-Dec', '20-Apr', '20-May', '20-Jun']

# Replace every '[x]' marker with 0 in a single vectorised call
# (equivalent to looping over the columns one by one).
new_data[columns_to_check] = new_data[columns_to_check].replace('[x]', 0)

# Verify changes by displaying the updated columns
print(new_data[columns_to_check])

# Persist the cleaned frame so the next cell can reload it.
new_data.to_csv('updated_data.csv', index=False)
# Rationale: the dataset distinguishes '[x]' markers from truly empty cells, so
# '[x]' is treated as zero here, while the genuinely empty cells are filled with
# the column mean further below.
     17-Dec  20-Apr  20-May  20-Jun
0    1330.0  1185.0  1175.0  1195.0
1     780.0   900.0   780.0   925.0
2    3375.0  3100.0  2370.0  2765.0
3    1990.0  3145.0  2865.0  2955.0
4    1225.0  1605.0  1465.0  1315.0
..      ...     ...     ...     ...
238  1115.0  1170.0   950.0  1225.0
239  1180.0   890.0   790.0  1160.0
240   350.0   295.0   310.0   410.0
241  1845.0  2000.0  1730.0  2040.0
242    45.0    55.0    30.0    55.0

[243 rows x 4 columns]
In [28]:
# Reload the cleaned dataset and preview the first few rows.
one = pd.read_csv('updated_data.csv')
one.head(7)
Out[28]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands 24 Business, media and public service professionals NaN 4005 3540.0 4215 3745.0 4155.0 4365 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands 31 Science, engineering and technology associate ... NaN 2215 2125.0 2560 2070.0 2320.0 2150 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 96 columns

In [29]:
one.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 96 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Region                                 243 non-null    object 
 1   SOC 2 digit code                       243 non-null    object 
 2   SOC 2 digit label                      243 non-null    object 
 3   Prioritise using SOC 2 as high or low  214 non-null    object 
 4   17-Jan                                 243 non-null    int64  
 5   17-Feb                                 242 non-null    float64
 6   17-Mar                                 243 non-null    int64  
 7   17-Apr                                 242 non-null    float64
 8   17-May                                 241 non-null    float64
 9   17-Jun                                 243 non-null    int64  
 10  17-Jul                                 243 non-null    int64  
 11  17-Aug                                 242 non-null    float64
 12  17-Sep                                 243 non-null    int64  
 13  17-Oct                                 243 non-null    int64  
 14  17-Nov                                 242 non-null    float64
 15  17-Dec                                 242 non-null    float64
 16  18-Jan                                 243 non-null    int64  
 17  18-Feb                                 243 non-null    int64  
 18  18-Mar                                 243 non-null    int64  
 19  18-Apr                                 240 non-null    float64
 20  18-May                                 243 non-null    int64  
 21  18-Jun                                 243 non-null    int64  
 22  18-Jul                                 243 non-null    int64  
 23  18-Aug                                 243 non-null    int64  
 24  18-Sep                                 242 non-null    float64
 25  18-Oct                                 243 non-null    int64  
 26  18-Nov                                 243 non-null    int64  
 27  18-Dec                                 243 non-null    int64  
 28  19-Jan                                 243 non-null    int64  
 29  19-Feb                                 243 non-null    int64  
 30  19-Mar                                 242 non-null    float64
 31  19-Apr                                 243 non-null    int64  
 32  19-May                                 243 non-null    int64  
 33  19-Jun                                 242 non-null    float64
 34  19-Jul                                 243 non-null    int64  
 35  19-Aug                                 243 non-null    int64  
 36  19-Sep                                 243 non-null    int64  
 37  19-Oct                                 242 non-null    float64
 38  19-Nov                                 243 non-null    int64  
 39  19-Dec                                 243 non-null    int64  
 40  20-Jan                                 243 non-null    int64  
 41  20-Feb                                 242 non-null    float64
 42  20-Mar                                 243 non-null    int64  
 43  20-Apr                                 242 non-null    float64
 44  20-May                                 240 non-null    float64
 45  20-Jun                                 242 non-null    float64
 46  20-Jul                                 243 non-null    int64  
 47  20-Aug                                 243 non-null    int64  
 48  20-Sep                                 243 non-null    int64  
 49  20-Oct                                 241 non-null    float64
 50  20-Nov                                 243 non-null    int64  
 51  20-Dec                                 243 non-null    int64  
 52  21-Jan                                 242 non-null    float64
 53  21-Feb                                 243 non-null    int64  
 54  21-Mar                                 243 non-null    int64  
 55  21-Apr                                 242 non-null    float64
 56  21-May                                 243 non-null    int64  
 57  21-Jun                                 243 non-null    int64  
 58  21-Jul                                 243 non-null    int64  
 59  21-Aug                                 242 non-null    float64
 60  21-Sep                                 243 non-null    int64  
 61  21-Oct                                 243 non-null    int64  
 62  21-Nov                                 242 non-null    float64
 63  21-Dec                                 243 non-null    int64  
 64  22-Jan                                 243 non-null    int64  
 65  22-Feb                                 242 non-null    float64
 66  22-Mar                                 243 non-null    int64  
 67  22-Apr                                 243 non-null    int64  
 68  22-May                                 242 non-null    float64
 69  22-Jun                                 243 non-null    int64  
 70  22-Jul                                 243 non-null    int64  
 71  22-Aug                                 242 non-null    float64
 72  22-Sep                                 243 non-null    int64  
 73  22-Oct                                 243 non-null    int64  
 74  22-Nov                                 242 non-null    float64
 75  22-Dec                                 243 non-null    int64  
 76  23-Jan                                 241 non-null    float64
 77  23-Feb                                 243 non-null    int64  
 78  23-Mar                                 242 non-null    float64
 79  23-Apr                                 243 non-null    int64  
 80  23-May                                 242 non-null    float64
 81  23-Jun                                 243 non-null    int64  
 82  23-Jul                                 243 non-null    int64  
 83  23-Aug                                 242 non-null    float64
 84  23-Sep                                 243 non-null    int64  
 85  23-Oct                                 242 non-null    float64
 86  23-Nov                                 242 non-null    float64
 87  23-Dec                                 243 non-null    int64  
 88  24-Jan                                 243 non-null    int64  
 89  24-Feb                                 243 non-null    int64  
 90  24-Mar                                 243 non-null    int64  
 91  24-Apr                                 242 non-null    float64
 92  24-May                                 243 non-null    int64  
 93  24-Jun                                 243 non-null    int64  
 94  24-Jul                                 243 non-null    int64  
 95  24-Aug                                 243 non-null    int64  
dtypes: float64(31), int64(61), object(4)
memory usage: 182.4+ KB
In [30]:
# The numeric month columns start at index 4, after the four descriptive
# columns (Region, SOC code, SOC label, priority flag).
month_cols = one.columns[4:]

# Impute remaining missing values with each column's median — robust to the
# heavy right skew visible in describe() (mean >> median for every month).
one[month_cols] = one[month_cols].fillna(one[month_cols].median())

# Keep a copy as the working DataFrame and persist it.
new_df = one.copy()
new_df.to_csv('new_data_one.csv', index=False)

# Preview the imputed data.
new_df.head(7)
Out[30]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands 24 Business, media and public service professionals NaN 4005 3540.0 4215 3745.0 4155.0 4365 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands 31 Science, engineering and technology associate ... NaN 2215 2125.0 2560 2070.0 2320.0 2150 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 96 columns

In [31]:
new_df.info()
# Confirm the imputation: every numeric column now reports 243 non-null values.
# (The 'Prioritise using SOC 2 as high or low' column still has 29 missing
# labels; those are handled separately below.)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 96 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Region                                 243 non-null    object 
 1   SOC 2 digit code                       243 non-null    object 
 2   SOC 2 digit label                      243 non-null    object 
 3   Prioritise using SOC 2 as high or low  214 non-null    object 
 4   17-Jan                                 243 non-null    int64  
 5   17-Feb                                 243 non-null    float64
 6   17-Mar                                 243 non-null    int64  
 7   17-Apr                                 243 non-null    float64
 8   17-May                                 243 non-null    float64
 9   17-Jun                                 243 non-null    int64  
 10  17-Jul                                 243 non-null    int64  
 11  17-Aug                                 243 non-null    float64
 12  17-Sep                                 243 non-null    int64  
 13  17-Oct                                 243 non-null    int64  
 14  17-Nov                                 243 non-null    float64
 15  17-Dec                                 243 non-null    float64
 16  18-Jan                                 243 non-null    int64  
 17  18-Feb                                 243 non-null    int64  
 18  18-Mar                                 243 non-null    int64  
 19  18-Apr                                 243 non-null    float64
 20  18-May                                 243 non-null    int64  
 21  18-Jun                                 243 non-null    int64  
 22  18-Jul                                 243 non-null    int64  
 23  18-Aug                                 243 non-null    int64  
 24  18-Sep                                 243 non-null    float64
 25  18-Oct                                 243 non-null    int64  
 26  18-Nov                                 243 non-null    int64  
 27  18-Dec                                 243 non-null    int64  
 28  19-Jan                                 243 non-null    int64  
 29  19-Feb                                 243 non-null    int64  
 30  19-Mar                                 243 non-null    float64
 31  19-Apr                                 243 non-null    int64  
 32  19-May                                 243 non-null    int64  
 33  19-Jun                                 243 non-null    float64
 34  19-Jul                                 243 non-null    int64  
 35  19-Aug                                 243 non-null    int64  
 36  19-Sep                                 243 non-null    int64  
 37  19-Oct                                 243 non-null    float64
 38  19-Nov                                 243 non-null    int64  
 39  19-Dec                                 243 non-null    int64  
 40  20-Jan                                 243 non-null    int64  
 41  20-Feb                                 243 non-null    float64
 42  20-Mar                                 243 non-null    int64  
 43  20-Apr                                 243 non-null    float64
 44  20-May                                 243 non-null    float64
 45  20-Jun                                 243 non-null    float64
 46  20-Jul                                 243 non-null    int64  
 47  20-Aug                                 243 non-null    int64  
 48  20-Sep                                 243 non-null    int64  
 49  20-Oct                                 243 non-null    float64
 50  20-Nov                                 243 non-null    int64  
 51  20-Dec                                 243 non-null    int64  
 52  21-Jan                                 243 non-null    float64
 53  21-Feb                                 243 non-null    int64  
 54  21-Mar                                 243 non-null    int64  
 55  21-Apr                                 243 non-null    float64
 56  21-May                                 243 non-null    int64  
 57  21-Jun                                 243 non-null    int64  
 58  21-Jul                                 243 non-null    int64  
 59  21-Aug                                 243 non-null    float64
 60  21-Sep                                 243 non-null    int64  
 61  21-Oct                                 243 non-null    int64  
 62  21-Nov                                 243 non-null    float64
 63  21-Dec                                 243 non-null    int64  
 64  22-Jan                                 243 non-null    int64  
 65  22-Feb                                 243 non-null    float64
 66  22-Mar                                 243 non-null    int64  
 67  22-Apr                                 243 non-null    int64  
 68  22-May                                 243 non-null    float64
 69  22-Jun                                 243 non-null    int64  
 70  22-Jul                                 243 non-null    int64  
 71  22-Aug                                 243 non-null    float64
 72  22-Sep                                 243 non-null    int64  
 73  22-Oct                                 243 non-null    int64  
 74  22-Nov                                 243 non-null    float64
 75  22-Dec                                 243 non-null    int64  
 76  23-Jan                                 243 non-null    float64
 77  23-Feb                                 243 non-null    int64  
 78  23-Mar                                 243 non-null    float64
 79  23-Apr                                 243 non-null    int64  
 80  23-May                                 243 non-null    float64
 81  23-Jun                                 243 non-null    int64  
 82  23-Jul                                 243 non-null    int64  
 83  23-Aug                                 243 non-null    float64
 84  23-Sep                                 243 non-null    int64  
 85  23-Oct                                 243 non-null    float64
 86  23-Nov                                 243 non-null    float64
 87  23-Dec                                 243 non-null    int64  
 88  24-Jan                                 243 non-null    int64  
 89  24-Feb                                 243 non-null    int64  
 90  24-Mar                                 243 non-null    int64  
 91  24-Apr                                 243 non-null    float64
 92  24-May                                 243 non-null    int64  
 93  24-Jun                                 243 non-null    int64  
 94  24-Jul                                 243 non-null    int64  
 95  24-Aug                                 243 non-null    int64  
dtypes: float64(31), int64(61), object(4)
memory usage: 182.4+ KB
In [32]:
new_df.head(7)
Out[32]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands 24 Business, media and public service professionals NaN 4005 3540.0 4215 3745.0 4155.0 4365 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands 31 Science, engineering and technology associate ... NaN 2215 2125.0 2560 2070.0 2320.0 2150 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 96 columns

In [33]:
# Inspect the priority column (index 3): distinct labels and their count.
# Note: nunique() ignores NaN, which is why it reports 2 while unique()
# lists three entries including nan.
priority_values = new_df.iloc[:, 3].unique()
priority_count = new_df.iloc[:, 3].nunique()

print(f"Unique values: {priority_values}")
print(f"Number of unique values: {priority_count}")
Unique values: ['high' 'low' nan]
Number of unique values: 2
In [ ]:
 
In [34]:
# Inspect the SOC 2 digit code column (index 1) the same way.
# The 'Unknown' entry explains why this column is object dtype, not int.
soc_codes = new_df.iloc[:, 1].unique()
soc_code_count = new_df.iloc[:, 1].nunique()

print(f"Unique values: {soc_codes}")
print(f"Number of unique values: {soc_code_count}")
Unique values: ['11' '12' '21' '22' '23' '24' '31' '32' '33' '34' '35' '41' '42' '51'
 '52' '53' '54' '61' '62' '63' '71' '72' '81' '82' '91' '92' 'Unknown']
Number of unique values: 27
In [35]:
# Replace missing values in 'Prioritise using SOC 2 as high or low' with the
# explicit label 'unknown' so the column can be one-hot encoded later.
# BUG FIX: removed the dead variable `row_index = 3`, which was never used.
new_df['Prioritise using SOC 2 as high or low'] = (
    new_df['Prioritise using SOC 2 as high or low'].fillna('unknown')
)
In [ ]:
 
In [36]:
new_df.head(7)
Out[36]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands 24 Business, media and public service professionals unknown 4005 3540.0 4215 3745.0 4155.0 4365 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands 31 Science, engineering and technology associate ... unknown 2215 2125.0 2560 2070.0 2320.0 2150 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 96 columns

In [37]:
# Drop the SOC code/label columns (positions 1 and 2) — they are identifiers,
# not predictive features — and persist the reduced DataFrame.
new_data_II = new_df.drop(new_df.columns[1:3], axis=1)
new_data_II.to_csv('new_dataii.csv', index=False)
In [38]:
df_II = pd.read_csv('new_dataii.csv')
In [39]:
df_II.head(7)
Out[39]:
Region Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun 17-Jul 17-Aug ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands high 2315 1955.0 2380 1950.0 2210.0 2080 1665 1880.0 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands low 1420 1240.0 1465 1250.0 1475.0 1555 1130 1145.0 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands low 5010 4165.0 5170 4775.0 5130.0 4580 3730 3975.0 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands high 2565 2020.0 2670 2515.0 2890.0 2655 2315 2415.0 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands low 2075 1850.0 2160 1670.0 1940.0 2200 1225 1320.0 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands unknown 4005 3540.0 4215 3745.0 4155.0 4365 3190 3440.0 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands unknown 2215 2125.0 2560 2070.0 2320.0 2150 1850 1870.0 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 94 columns

In [40]:
# Encoders for categorical columns.
# NOTE(review): LabelEncoder is imported but never used below — only the
# category_encoders one-hot encoder (`ce`) is applied.
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
In [41]:
# One-hot encode the 'Region' column.
df = pd.read_csv('new_dataii.csv')

region_encoder = ce.OneHotEncoder(
    cols='Region', handle_unknown='return_nan', return_df=True, use_cat_names=True
)
region_encoded = region_encoder.fit_transform(df[['Region']])

# Remove the original categorical column and place the dummies first.
without_region = df.drop(columns=['Region'])
df_updated = pd.concat([region_encoded, without_region], axis=1)

# Persist for the next stage.
df_updated.to_csv('updated_dataset_hopefully.csv', index=False)

print("The 'Region' column has been replaced with one-hot encoded columns at the start of the dataset.")
The 'Region' column has been replaced with one-hot encoded columns at the start of the dataset.
In [42]:
papa = pd.read_csv('updated_dataset_hopefully.csv')
In [43]:
papa.head()
Out[43]:
Region_East Midlands Region_East of England Region_London Region_North East Region_North West Region_South East Region_South West Region_West Midlands Region_Yorkshire and The Humber Prioritise using SOC 2 as high or low ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 high ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 low ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 low ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 high ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 low ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560

5 rows × 102 columns

In [ ]:
 
In [44]:
# One-hot encode the 'Prioritise using SOC 2 as high or low' column.
df = pd.read_csv('updated_dataset_hopefully.csv')

priority_col = 'Prioritise using SOC 2 as high or low'
priority_encoder = ce.OneHotEncoder(
    cols=priority_col, handle_unknown='return_nan', return_df=True, use_cat_names=True
)
priority_encoded = priority_encoder.fit_transform(df[[priority_col]])

# Drop the original priority column (the old comment wrongly said 'Region').
remaining = df.drop(columns=[priority_col])

# Dummies first, then the rest of the dataset.
df_updated = pd.concat([priority_encoded, remaining], axis=1)

# Save the fully encoded dataset.
df_updated.to_csv('updated_data_hopefully.csv', index=False)

print("The 'Prioritise using SOC 2 as high or low' column has been replaced with one-hot encoded columns at the start of the dataset.")
The 'Prioritise using SOC 2 as high or low' column has been replaced with one-hot encoded columns at the start of the dataset.
In [45]:
# Verify the fully encoded dataset: 3 priority dummies + 9 region dummies
# + 92 month columns = 104 columns.
papa_kay = pd.read_csv('updated_data_hopefully.csv')
papa_kay.head()
Out[45]:
Prioritise using SOC 2 as high or low_high Prioritise using SOC 2 as high or low_low Prioritise using SOC 2 as high or low_unknown Region_East Midlands Region_East of England Region_London Region_North East Region_North West Region_South East Region_South West ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560

5 rows × 104 columns

NORMALIZATION USING ROBUST SCALER

In [46]:
# NORMALIZATION: robust-scale the month columns.  RobustScaler centres on the
# median and scales by the IQR, so the heavy outliers seen in describe()
# (mean >> median) do not dominate the scaling.
from sklearn.preprocessing import RobustScaler

ml_data = pd.read_csv('updated_data_hopefully.csv')

# BUG FIX: the original cell derived `onehot_cols`, `numeric_cols`, and the
# scaler input from `df` — a stale DataFrame left over from the previous
# encoding cell, read from a *different* CSV (without the priority dummies) —
# instead of `ml_data`.  It only produced correct numbers by coincidence,
# because the month columns of the two files happen to be identical.
onehot_cols = [col for col in ml_data.columns
               if col.startswith(('Region_', 'Prioritise using SOC 2 as high or low_'))]

numeric_cols = ml_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Scale only the numeric columns that are not one-hot indicator columns.
cols_to_scale = [col for col in numeric_cols if col not in onehot_cols]

# Instantiate the RobustScaler and apply it to the selected columns.
scaler = RobustScaler()
ml_data[cols_to_scale] = scaler.fit_transform(ml_data[cols_to_scale])

# Save the normalized data for the modelling stage.
ml_data.to_csv('normalized_dataset_one.csv', index=False)

# Check the first few rows to see that the scaling worked.
ml_data.head()
Out[46]:
Prioritise using SOC 2 as high or low_high Prioritise using SOC 2 as high or low_low Prioritise using SOC 2 as high or low_unknown Region_East Midlands Region_East of England Region_London Region_North East Region_North West Region_South East Region_South West ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... -0.045455 0.043651 -0.006079 -0.039042 -0.037534 -0.097638 -0.134417 -0.164243 -0.120032 -0.132127
1 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... -0.223562 -0.148810 -0.174772 -0.188110 -0.212690 -0.234646 -0.261448 -0.260051 -0.300081 -0.229864
2 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.305195 0.438492 0.357143 0.333629 0.309205 0.165354 0.091581 0.061591 0.037307 0.123077
3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.622449 0.644841 0.458967 0.402839 0.428954 0.348031 0.276219 0.275449 0.248175 0.255204
4 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.028757 0.212302 0.080547 0.141970 0.223414 0.096063 0.141802 0.046193 -0.082725 -0.133937

5 rows × 104 columns

TASK 4: USING RANDOM FOREST REGRESSOR

In [47]:
# Random Forest Regressor and evaluation utilities.
# NOTE(review): cross_val_score is imported but not used in the cells below.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
In [48]:
# Load the preprocessed (normalized) dataset that already contains one-hot
# encoded columns, and preview it before modelling.
df_random = pd.read_csv('normalized_dataset_one.csv')
df_random.head()
Out[48]:
Prioritise using SOC 2 as high or low_high Prioritise using SOC 2 as high or low_low Prioritise using SOC 2 as high or low_unknown Region_East Midlands Region_East of England Region_London Region_North East Region_North West Region_South East Region_South West ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... -0.045455 0.043651 -0.006079 -0.039042 -0.037534 -0.097638 -0.134417 -0.164243 -0.120032 -0.132127
1 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... -0.223562 -0.148810 -0.174772 -0.188110 -0.212690 -0.234646 -0.261448 -0.260051 -0.300081 -0.229864
2 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.305195 0.438492 0.357143 0.333629 0.309205 0.165354 0.091581 0.061591 0.037307 0.123077
3 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.622449 0.644841 0.458967 0.402839 0.428954 0.348031 0.276219 0.275449 0.248175 0.255204
4 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.028757 0.212302 0.080547 0.141970 0.223414 0.096063 0.141802 0.046193 -0.082725 -0.133937

5 rows × 104 columns

In [49]:
# Load the preprocessed (normalized) dataset that already contains
# one-hot encoded columns.
df_random = pd.read_csv('normalized_dataset_one.csv')

# Target: the most recent month of advert counts.
target_col = "24-Aug"
# BUG FIX: the guard previously checked the stale `df` variable from an
# earlier cell instead of `df_random`, the DataFrame actually used here.
if target_col not in df_random.columns:
    raise ValueError(f"Target column '{target_col}' not found in the dataset.")

# Separate features (X) and target (y).
X = df_random.drop(columns=[target_col])
y = df_random[target_col]

# Train / validation / test split: first 60% / 40%, ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

# ... then the temporary 40% is halved into validation and test (20% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

print("Dataset sizes:")
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)
Dataset sizes:
Training set: (145, 103) (145,)
Validation set: (49, 103) (49,)
Test set: (49, 103) (49,)
In [50]:
# Fit a Random Forest with explicit (default-leaning) hyperparameters.
rfr = RandomForestRegressor(
    n_estimators=100,       # number of trees in the forest
    random_state=42,        # reproducibility
    n_jobs=-1,              # use all available CPU cores
    max_depth=None,         # grow trees fully
    min_samples_split=2,    # sklearn default
    min_samples_leaf=1      # sklearn default
)
rfr.fit(X_train, y_train)

# Validation performance — used for hyperparameter decisions.
y_val_pred = rfr.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE: {:.4f}".format(val_rmse))

# Held-out test performance — evaluated once hyperparameters are fixed.
y_test_pred = rfr.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test RMSE: {:.4f}".format(test_rmse))
Validation RMSE: 0.2132
Test RMSE: 0.0960
In [51]:
# Build a side-by-side table of actual vs predicted advert counts on the
# test set (values are in normalized units).
y_test_pred = rfr.predict(X_test)
comparison_df = pd.DataFrame({
    "Actual Values (Test Data)": y_test.values,
    "Predicted Values (Test Data)": y_test_pred,
})

# Show a few rows for a quick sanity check.
print("Comparison of Actual vs Predicted Values for 'Aug-24 Advert Count':")
print(comparison_df.head())

# Persist the comparison; the plotting cell below reads this file.
comparison_df.to_csv('comparison_aug_24.csv', index=False)
print("Comparison saved to 'comparison_aug_24.csv'.")
Comparison of Actual vs Predicted Values for 'Aug-24 Advert Count':
   Actual Values (Test Data)  Predicted Values (Test Data)
0                   0.743891                      0.816489
1                  -0.275113                     -0.181792
2                  -0.209955                     -0.259041
3                   0.128507                      0.143946
4                   1.084163                      1.105213
Comparison saved to 'comparison_aug_24.csv'.
In [52]:
# Re-load the saved comparison and plot actual vs predicted per test row.
comparison_df = pd.read_csv('comparison_aug_24.csv')

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(comparison_df.index, comparison_df['Actual Values (Test Data)'],
        label='Actual Values', marker='o', linestyle='--', color='blue')
ax.plot(comparison_df.index, comparison_df['Predicted Values (Test Data)'],
        label='Predicted Values', marker='x', linestyle='-', color='orange')

# Labels, title and cosmetics — a figure should stand alone when skimmed.
ax.set_title('Actual vs Predicted Values for Aug-24 Advert Count', fontsize=14)
ax.set_xlabel('Index of Test Data Points', fontsize=12)
ax.set_ylabel('Advert Count (Normalized)', fontsize=12)
ax.legend(loc='best', fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig('Actual vs predicted 2024.png', format='png', bbox_inches='tight')
plt.show()
No description has been provided for this image
In [53]:
from sklearn.metrics import r2_score

# Score the already-fitted forest on all three partitions so that
# train / validation / test R² can be compared for overfitting.
y_train_pred = rfr.predict(X_train)

r2_val = r2_score(y_val, y_val_pred)
r2_tst = r2_score(y_test, y_test_pred)
r2_trn = r2_score(y_train, y_train_pred)

print(f"Validation R²: {r2_val:.4f}")
print(f"Test R²: {r2_tst:.4f}")
print(f"Training R²: {r2_trn:.4f}")
Validation R²: 0.9307
Test R²: 0.9767
Training R²: 0.9957
In [54]:
# Overfitting check: RMSE on the training data itself, to compare
# against the validation/test RMSE printed earlier.
y_train_pred = rfr.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Training RMSE: {train_rmse:.4f}")
Training RMSE: 0.0609
In [55]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Final performance report for the regression model on the held-out test set.
y_test_pred = rfr.predict(X_test)

# Coefficient of determination on the test data.
r2_test = r2_score(y_test, y_test_pred)

# Root-mean-squared error (same units as the normalised target).
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Mean absolute error — less sensitive to outliers than RMSE.
mae_test = mean_absolute_error(y_test, y_test_pred)

# Keep the metrics in one dict and print the report FROM it, so the
# stored values and the printed report can never drift apart.
# (Previously this dict was built but never used — dead code.)
results = {
    "R²": r2_test,
    "RMSE": rmse_test,
    "MAE": mae_test,
}

print("Model Performance Report:")
print(f"R² (Test): {results['R²']:.4f}")
print(f"RMSE (Test): {results['RMSE']:.4f}")
print(f"MAE (Test): {results['MAE']:.4f}")
Model Performance Report:
R² (Test): 0.9767
RMSE (Test): 0.0960
MAE (Test): 0.0596
In [56]:
import matplotlib.pyplot as plt

# Scatter of predicted vs actual test values; points on the red dashed
# identity line correspond to perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred, color='blue', label="Predicted vs Actual")
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, color='red', linestyle='--', label="Perfect Prediction")
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Prediction vs Actual values for Regression")
ax.legend()
fig.savefig('Predicted vs actual scatterplot.png', format='png', bbox_inches='tight')
plt.show()
No description has been provided for this image

ALGORITHM FOR CLASSIFICATION MODEL

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.inspection import permutation_importance
In [105]:
# Load the prepared classification dataset: one row per Region x SOC-2
# occupation, with monthly advert counts (Jan-17..Aug-24) and the
# high/low priority label used as the classification target.
svm_one = pd.read_csv('new_data_one.csv')
svm_one.head(7)  # preview — note rows 5-6 have NaN labels
Out[105]:
Region SOC 2 digit code SOC 2 digit label Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands 11 Corporate managers and directors high 2315 1955.0 2380 1950.0 2210.0 2080 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands 12 Other managers and proprietors low 1420 1240.0 1465 1250.0 1475.0 1555 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands 21 Science, research, engineering and technology ... low 5010 4165.0 5170 4775.0 5130.0 4580 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands 22 Health professionals high 2565 2020.0 2670 2515.0 2890.0 2655 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands 23 Teaching and other educational professionals low 2075 1850.0 2160 1670.0 1940.0 2200 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
5 East Midlands 24 Business, media and public service professionals NaN 4005 3540.0 4215 3745.0 4155.0 4365 ... 2950.0 2745 3635 2800 2585 2935.0 3020 2530 2725 2485
6 East Midlands 31 Science, engineering and technology associate ... NaN 2215 2125.0 2560 2070.0 2320.0 2150 ... 1570.0 1590 2025 1630 1655 1845.0 1965 1840 1850 1745

7 rows × 96 columns

In [106]:
# Drop the 'SOC 2 digit code' / 'SOC 2 digit label' columns (positions
# 1-2), keeping Region, the priority label, and the monthly counts.
svm_one_clean = svm_one.drop(columns=svm_one.columns[1:3])
svm_one_clean
Out[106]:
Region Prioritise using SOC 2 as high or low 17-Jan 17-Feb 17-Mar 17-Apr 17-May 17-Jun 17-Jul 17-Aug ... 23-Nov 23-Dec 24-Jan 24-Feb 24-Mar 24-Apr 24-May 24-Jun 24-Jul 24-Aug
0 East Midlands high 2315 1955.0 2380 1950.0 2210.0 2080 1665 1880.0 ... 1500.0 1400 1850 1525 1540 1785.0 1870 1610 1830 1565
1 East Midlands low 1420 1240.0 1465 1250.0 1475.0 1555 1130 1145.0 ... 1020.0 915 1295 1105 1050 1350.0 1440 1330 1275 1295
2 East Midlands low 5010 4165.0 5170 4775.0 5130.0 4580 3730 3975.0 ... 2445.0 2395 3045 2575 2510 2620.0 2635 2270 2315 2270
3 East Midlands high 2565 2020.0 2670 2515.0 2890.0 2655 2315 2415.0 ... 3300.0 2915 3380 2770 2845 3200.0 3260 2895 2965 2635
4 East Midlands low 2075 1850.0 2160 1670.0 1940.0 2200 1225 1320.0 ... 1700.0 1825 2135 2035 2270 2400.0 2805 2225 1945 1560
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
238 Yorkshire and The Humber low 1635 1600.0 2190 1985.0 2125.0 2175 1775 1865.0 ... 1385.0 1280 1885 1570 1620 2160.0 2180 2120 2295 2255
239 Yorkshire and The Humber high 1435 1280.0 1735 1685.0 1870.0 2290 1840 1905.0 ... 1190.0 845 1265 1200 1145 1850.0 2345 2385 2575 2480
240 Yorkshire and The Humber low 490 590.0 845 660.0 855.0 845 815 810.0 ... 375.0 340 555 425 465 740.0 855 775 910 875
241 Yorkshire and The Humber high 2610 2960.0 3825 3680.0 3945.0 5215 3760 3150.0 ... 2840.0 2550 3955 3590 4005 4935.0 4985 4600 4910 4875
242 Yorkshire and The Humber high 85 65.0 85 45.0 70.0 55 55 65.0 ... 35.0 35 45 35 30 15.0 35 25 40 25

243 rows × 94 columns

In [ ]:
 
In [107]:
# Drop rows with missing values before modelling.
# NOTE(review): dropna() with no arguments removes rows with a NaN in
# ANY column, not only the target label — rows missing a monthly count
# are discarded too (243 rows -> 214, per the class distribution a few
# cells below). To drop only unlabeled rows, use
# dropna(subset=['Prioritise using SOC 2 as high or low']) instead —
# but note downstream results would then change.
new_svm_cleaned = svm_one_clean.dropna()
In [108]:
# Select features and target variable.
# Column 0 is Region (identifier), column 1 is the high/low priority
# label, and columns 2+ are the monthly advert counts (17-Jan..24-Aug).
X = new_svm_cleaned.iloc[:, 2:]  # numeric monthly counts only
y = new_svm_cleaned.iloc[:, 1]   # 'high' / 'low' target label
In [109]:
# Normalize numerical columns.
# NOTE(review): the scaler is fitted on the FULL dataset here, before
# the train/test split two cells below — test-set statistics leak into
# the training features. Safer pattern: split first, then
# scaler.fit_transform(X_train) and scaler.transform(X_test).
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)  # Apply directly
In [110]:
# Turn the 'high'/'low' strings into integer class codes
# (LabelEncoder assigns codes in sorted order of the class labels).
label_enc = LabelEncoder()
y = label_enc.fit_transform(y)
In [111]:
# Check class distribution of the encoded target — the two classes are
# roughly balanced (104 vs 110), so accuracy is a usable headline metric.
print("Class distribution:", np.bincount(y))
Class distribution: [104 110]
In [112]:
# Split the data (75% train, 25% test); random_state pins the split for
# reproducibility.
# NOTE(review): no stratify=y here — with a near-balanced binary target
# this happens to produce a balanced test set (27/27 in the confusion
# matrix below), but stratified splitting would guarantee it.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42,)
In [113]:
# Fit an RBF-kernel SVM; class_weight='balanced' re-weights the classes
# by inverse frequency to guard against mild class skew.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
)
svm.fit(X_train, y_train)
Out[113]:
SVC(class_weight='balanced', random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(class_weight='balanced', random_state=42)
In [118]:
# Evaluate the fitted SVM on the held-out 25% test split.
y_test_pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy: {:.4f}".format(test_accuracy))
Test Accuracy: 0.5370
In [124]:
# Accuracy plus per-class precision/recall/F1 — the report shows recall
# on class 0 is only 0.22, which the headline accuracy hides.
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
Test Accuracy: 0.5370
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.22      0.32        27
           1       0.52      0.85      0.65        27

    accuracy                           0.54        54
   macro avg       0.56      0.54      0.49        54
weighted avg       0.56      0.54      0.49        54

In [120]:
# Confusion counts: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:", cm, sep="\n")
Confusion Matrix:
[[ 6 21]
 [ 4 23]]
In [121]:
from sklearn.inspection import permutation_importance
In [122]:
# Permutation importance: accuracy drop on the test set when each
# feature column is shuffled (fixed random_state for reproducibility).
result = permutation_importance(svm, X_test, y_test, scoring="accuracy", random_state=42)

# Feature labels: every column after Region and the target label.
feature_names = new_svm_cleaned.columns[2:]

# Indices of the ten largest mean importances, in ascending order
# (ascending so barh draws the most important feature at the top).
sorted_idx = np.argsort(result.importances_mean)[-10:]
In [123]:
# Horizontal bar chart of the ten strongest permutation importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(np.array(feature_names)[sorted_idx], result.importances_mean[sorted_idx], color="skyblue")
ax.set_xlabel("Permutation Importance Score")
ax.set_ylabel("Features")
ax.set_title("Top 10 Permutation Feature Importance in SVM")
ax.grid(axis='x', linestyle='--', alpha=0.7)

fig.savefig('Feature importance.png', format='png', bbox_inches='tight')

plt.show()
No description has been provided for this image
In [ ]: